%run set_theme.ipynb
import os
from plotly.offline import init_notebook_mode
import plotly.colors as pc
import plotly.express as px
import numpy as np
import pandas as pd
import numpy as np
init_notebook_mode()
# RECALCULATE forces the aggregate to be rebuilt from the raw survey data even
# when a cached copy exists; PRINT enables progress/summary output.
RECALCULATE = False
PRINT = False

if not os.path.isfile('cache/gender_devtype.pq') or RECALCULATE:
    full_df = pd.read_parquet('../data/SO_2014_2022.pq')

    # Positions to analyse: the first entry of every ';'-separated DevType answer.
    devtypes = list(full_df['DevType'].dropna().apply(lambda s: s.split(';')[0]).unique())
    # Each respondent's DevType answer as a list of positions (NaN when unanswered).
    dt_series = full_df.DevType.apply(lambda s: s.split(';') if isinstance(s, str) else np.nan)

    # Per-position result tables, filled row by row in the loop below.
    dt_mean_sal = pd.DataFrame(columns=['Salary'], index=devtypes)
    dt_gender_count = pd.DataFrame(columns=['Male', 'Female'], index=devtypes)
    dt_gender_sal = pd.DataFrame(columns=['Male', 'Female'], index=devtypes)

    devtype = None

    # Loop closure
    def filter_devtype(types: list, value):
        """Return ``value`` if the current ``devtype`` is in ``types``, else NaN.

        ``devtype`` is deliberately read from the enclosing (global) scope: it is
        rebound by the loop below, so this function always filters on the
        position currently being processed.
        """
        if devtype not in types:
            return np.nan
        return value

    # Each frame keeps only the rows where all of its columns are present.
    search_mean_sal = pd.concat([dt_series, full_df.Salary], axis=1).dropna()
    search_gender_count = pd.concat([dt_series, full_df.Gender], axis=1).dropna()
    search_gender_sal = pd.concat([dt_series, full_df.Salary, full_df.Gender], axis=1).dropna()

    # Result column -> label used in the Gender column.
    gender_labels = {'Male': 'male', 'Female': 'female'}
    # The per-gender split does not depend on the position, so do it once
    # instead of once per loop iteration.
    salary_rows_by_gender = {
        column: search_gender_sal[search_gender_sal.Gender == label]
        for column, label in gender_labels.items()
    }

    for progress, devtype in enumerate(devtypes):
        if PRINT:
            print(f'{progress / len(devtypes):>4.0%}')

        dt_mean_sal.loc[devtype, 'Salary'] = search_mean_sal.DevType.combine(
            search_mean_sal.Salary, filter_devtype).mean()

        # value_counts() orders by frequency, so look the counts up by label
        # rather than relying on their position.
        counts = search_gender_count.DevType.combine(
            search_gender_count.Gender, filter_devtype).value_counts()

        for column, label in gender_labels.items():
            dt_gender_count.loc[devtype, column] = counts.get(label, 0)
            rows = salary_rows_by_gender[column]
            # Single .loc[row, col] assignment: chained indexing
            # (.loc[row][col] = ...) does not write through under copy-on-write.
            dt_gender_sal.loc[devtype, column] = rows.DevType.combine(
                rows.Salary, filter_devtype).mean()

    if PRINT:
        print('100%')

    df = pd.DataFrame()
    df['Salary'] = dt_mean_sal['Salary']
    df['MaleCount'] = dt_gender_count['Male']
    df['FemaleCount'] = dt_gender_count['Female']
    df['MaleSalary'] = dt_gender_sal['Male']
    df['FemaleSalary'] = dt_gender_sal['Female']
    # Pay gap as a percentage of the position's overall mean salary
    # (positive: men earn more).
    df['Gap'] = (df['MaleSalary'] - df['FemaleSalary']) / df['Salary'] * 100
    # Number of men per woman.
    df['Prop'] = df['MaleCount'] / df['FemaleCount']

    # Some positions might be unreliable due to low female respondent count
    df.query('MaleCount > 200 & FemaleCount > 200', inplace=True)

    # Make sure the cache directory exists before writing to it.
    os.makedirs('cache', exist_ok=True)
    df.to_parquet('cache/gender_devtype.pq')
    RECALCULATE = False
else:
    df = pd.read_parquet('cache/gender_devtype.pq')
    if PRINT:
        print('Loaded from cache')

df.head()
| | Salary | MaleCount | FemaleCount | MaleSalary | FemaleSalary | Gap | Prop |
|---|---|---|---|---|---|---|---|
| Data scientist or machine learning specialist | 66741.062337 | 27967 | 2135 | 67065.807725 | 65158.192406 | 2.858233 | 13.099297 |
| Engineer, data | 74134.871049 | 15341 | 801 | 73851.015016 | 75325.595661 | -1.989051 | 19.152310 |
| Developer, front-end | 57962.401026 | 95441 | 7128 | 58357.289909 | 56208.281881 | 3.707590 | 13.389590 |
| Student | 24490.338723 | 34176 | 2740 | 24634.515929 | 24305.919094 | 1.341741 | 12.472993 |
| Developer, full-stack | 62865.299330 | 182772 | 9983 | 63024.234952 | 63244.182031 | -0.349870 | 18.308324 |
if PRINT:
    # Each entry: (headline, column to rank by, sort ascending?).
    # 'Gap' is positive when men are paid more, so the smallest values favour women.
    # 'Prop' is the number of men per woman, so the SMALLEST values mark the
    # positions with the highest share of women and the largest values the lowest.
    rankings = [
        ('Positions with most female favourable pay gap:', 'Gap', True),
        ('\nPositions with most male favourable pay gap:', 'Gap', False),
        ('\nPositions with highest female proportion:', 'Prop', True),
        ('\nPositions with lowest female proportion:', 'Prop', False),
    ]
    for headline, column, ascending in rankings:
        top_positions = df.sort_values(by=column, ascending=ascending).head(3).index.tolist()
        print(headline, *top_positions, sep='\n\t')
# Scatter plot with one marker per position: men-per-woman ratio on the x-axis,
# pay gap on the y-axis, plus an OLS trendline.
plot_frame = df.reset_index(names='Position')
axis_labels = {
    'Prop': 'Average number of men per woman in position',
    'Gap': 'Percentage higher payment of men',
}
# Only the position name is passed through as hover data.
hover_columns = {'Position': True, 'Prop': False, 'Gap': False}
# Two-stop colour scale running from pink (low Prop) to blue (high Prop).
pink_to_blue = pc.make_colorscale(['#f854ee', '#4c75eb'])
# Marker size is driven by the square root of the male respondent count.
marker_sizes = df.MaleCount.apply(np.sqrt)
chart_title = 'Gender Ratio of Job Positions vs Pay Gap<br><sup>Positions with more female developers seem to suffer from greater payment inequality</sup>'

fig = px.scatter(
    plot_frame,
    x='Prop',
    y='Gap',
    color='Prop',
    color_continuous_scale=pink_to_blue,
    size=marker_sizes,
    size_max=16,
    hover_data=hover_columns,
    labels=axis_labels,
    title=chart_title,
    trendline='ols',
    trendline_color_override='#343a42',
    width=790,
)

# Hide the colour bar and set the bottom, top and left margins.
fig.update_layout(
    coloraxis_showscale=False,
    margin=dict(b=170, t=100, l=90),
)
# Give every trace a white-on-colour hover label and a custom hover text.
# The trendline trace is recognised by the word 'trendline' in its template.
for trace in fig.data:
    trace.hoverlabel = dict(font_color='white', bordercolor='white')
    is_trendline = 'trendline' in trace.hovertemplate
    trace.hovertemplate = (
        'At %{x:d} men per woman, the estimated pay gap would be %{y:.1f}%<extra></extra>'
        if is_trendline
        else 'Position: %{customdata[0]}<extra></extra>'
    )
# Explanatory caption, positioned in paper coordinates and left-aligned.
caption_parts = [
    'Every position is displayed as a single dot, where larger dots indicate positions with more respondents. The x-axis<br>',
    'shows the gender ratio, so dots towards the left (pink) have more women (a value of 10 means ten men per woman).<br>',
    'The y-axis shows the pay gap, with positive values favouring men and negative values favouring women.<br>',
    'Hover over a dot to see the corresponding position, or over the trendline to see the estimated pay gap.',
]
fig.add_annotation(
    text=''.join(caption_parts),
    x=-0.09,
    y=-0.55,
    xref='paper',
    yref='paper',
    xanchor='left',
    yanchor='bottom',
    align='left',
    showarrow=False,
)

fig.show()